{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### CADD\n",
    "\n",
    "Use this notebook to create the CADD Hail Tables after the raw data has been downloaded from https://cadd.gs.washington.edu/ with Hail Batch (see `datasets/extract/extract_CADD.py`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "import hail as hl\n",
    "# Initialize Hail with its default settings.\n",
    "hl.init()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bucket the block-gzipped TSVs are read from, and bucket the Hail Tables are written to.\n",
    "input_root = \"gs://hail-datasets-tmp\"\n",
    "output_root = \"gs://hail-datasets-us\"\n",
    "\n",
    "name = \"CADD\"\n",
    "version = \"v1.6\"\n",
    "builds = [\"GRCh37\", \"GRCh38\"]\n",
    "\n",
    "# Fields parsed as numbers on import; the remaining fields are left to import_table's defaults.\n",
    "field_types = {\n",
    "    \"position\": hl.tint,\n",
    "    \"raw_score\": hl.tfloat,\n",
    "    \"PHRED_score\": hl.tfloat,\n",
    "}\n",
    "\n",
    "for build in builds:\n",
    "    tsv_path = f\"{input_root}/{name}/{name}_{version}_{build}.tsv.bgz\"\n",
    "    ht_path = f\"{output_root}/{name}_{version}_{build}.ht\"\n",
    "\n",
    "    ht = hl.import_table(tsv_path, min_partitions=2048, types=field_types)\n",
    "\n",
    "    # Every build other than GRCh37 gets a \"chr\" prefix on the contig name.\n",
    "    contig = ht.chromosome if build == \"GRCh37\" else \"chr\" + ht.chromosome\n",
    "    ht = ht.annotate(locus=hl.locus(contig, ht.position, build),\n",
    "                     alleles=[ht.ref, ht.alt])\n",
    "    ht = ht.select(\"locus\", \"alleles\", \"raw_score\", \"PHRED_score\")\n",
    "    ht = ht.key_by(\"locus\", \"alleles\")\n",
    "\n",
    "    # Both figures are taken before the write so they can be stored in the table's globals.\n",
    "    n_rows = ht.count()\n",
    "    n_partitions = ht.n_partitions()\n",
    "    metadata = hl.struct(name=name, version=version, reference_genome=build,\n",
    "                         n_rows=n_rows, n_partitions=n_partitions)\n",
    "    ht = ht.annotate_globals(metadata=metadata)\n",
    "\n",
    "    # Write the table, then read it back and print the schema of what was stored.\n",
    "    ht.write(ht_path)\n",
    "    ht = hl.read_table(ht_path)\n",
    "    ht.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the written GRCh37 table: schema, global metadata, then a preview of its rows.\n",
    "ht37 = hl.read_table(\"gs://hail-datasets-us/CADD_v1.6_GRCh37.ht\")\n",
    "ht37.describe()\n",
    "metadata37 = hl.eval(ht37.metadata)\n",
    "print(f\"GRCh37: {metadata37!s}\")\n",
    "ht37.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the written GRCh38 table: schema, global metadata, then a preview of its rows.\n",
    "ht38 = hl.read_table(\"gs://hail-datasets-us/CADD_v1.6_GRCh38.ht\")\n",
    "ht38.describe()\n",
    "metadata38 = hl.eval(ht38.metadata)\n",
    "print(f\"GRCh38: {metadata38!s}\")\n",
    "ht38.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
